Setup R env. Load packages and set default image export formats, size and resolution.
knitr::opts_chunk$set(echo = TRUE,
fig.height = 12,
fig.width = 12,
dev = c("png", "pdf"),
dpi = 1000)
library(ggridges)
library(ggplot2)
library(tibble)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(reshape2)
library(scales)
library(ggrepel)
library(stringr)
library(glue)
library(ggpubr)
options(scipen = 999) #Prevent scientific notation
text.scale <- 0.5
Load the cleaned and processed datasets into R for plotting.
data.selected.OGs <- read.table("combined_results/combined.stats.OGs.tsv", sep='\t',
header=TRUE, check.names=FALSE, stringsAsFactors=FALSE)
data.selected.OGs <- data.selected.OGs$orthogroup_id
data.OGs <- read.table("../06_Final_Classifications/Orthogroups.Run2.classification.tsv.gz", sep='\t',
header=TRUE, check.names=FALSE, stringsAsFactors=FALSE) %>%
mutate(selected=if_else(orthogroup_id %in% data.selected.OGs, "Yes", "No", NA))
data.seqs <- read.table("combined_results/combined.stats.ALL_seqs.info.tsv", sep='\t',
header=TRUE, check.names=FALSE, stringsAsFactors=FALSE) %>%
mutate(selected=if_else(orthogroup_id %in% data.selected.OGs, "Yes", "No", NA))
Plot number of CDS per proteins (for proteins with genome coords available) - ALL Designations.
col <- "num_CDS"
col.name <- "Number of CDS/exons per protein"
x.min <- 1
x.max <- 20
p1 <- merge(data.seqs,
data.OGs %>% select(orthogroup_id, designation),
all=TRUE,
by="orthogroup_id"
) %>%
select(!!sym(col), designation) %>%
rename(value = !!sym(col)) %>%
rename(group = designation) %>%
filter(! is.na(value)) %>%
mutate(value = if_else(value < x.min, x.min,
if_else(value > x.max, x.max, value)
)
) %>%
ggplot(aes(x=value, y=group, fill=group)) +
geom_density_ridges(scale = 0.95) +
theme_ridges() +
theme(legend.position = "none",
plot.title = element_text(size=14*text.scale, hjust = 0.5, face="bold"),
axis.title.x = element_text(size=10*text.scale, hjust = 0.5, face="bold"),
axis.title.y = element_blank(),
axis.text.x = element_text(size=8*text.scale),
axis.text.y = element_text(size=8*text.scale, angle = 90, vjust = 0.5, hjust=0, face="bold")
) +
scale_fill_cyclical(values = c("#984ea3", "#bebada", "#ff7f00", "#fdbf6f")) +
labs(title="Distribution of CDS/exon count per gene",
x=col.name,
y="Orthogorup Designation") +
scale_x_continuous(expand = c(0, 0)) +
scale_y_discrete(expand = c(0, 0))
p1
## Picking joint bandwidth of 0.204
Plot number of CDS per proteins (for proteins with genome coords available) - ALL Designations.
col <- "PEP_length"
col.name <- "Protein length (aa)"
x.min <- 1
x.max <- 1000
p2 <- merge(data.seqs,
data.OGs %>% select(orthogroup_id, designation),
all=TRUE,
by="orthogroup_id"
) %>%
select(!!sym(col), designation) %>%
rename(value = !!sym(col)) %>%
rename(group = designation) %>%
filter(! is.na(value)) %>%
mutate(value = if_else(value < x.min, x.min,
if_else(value > x.max, x.max, value)
)
) %>%
ggplot(aes(x=value, y=group, fill=group)) +
geom_density_ridges(scale = 0.95) +
theme_ridges() +
theme(legend.position = "none",
plot.title = element_text(size=14*text.scale, hjust = 0.5, face="bold"),
axis.title.x = element_text(size=10*text.scale, hjust = 0.5, face="bold"),
axis.title.y = element_blank(),
axis.text.x = element_text(size=8*text.scale),
axis.text.y = element_text(size=8*text.scale, angle = 90, vjust = 0.5, hjust=0, face="bold")
) +
scale_fill_cyclical(values = c("#984ea3", "#bebada", "#ff7f00", "#fdbf6f")) +
labs(title="Distribution of protein lengths",
x=col.name,
y="Orthogorup Designation") +
scale_x_continuous(expand = c(0, 0)) +
scale_y_discrete(expand = c(0, 0))
p2
## Picking joint bandwidth of 11.5
Plot length of proteins.
col <- "num_CDS"
col.name <- "Number of CDS/exons per protein"
x.min <- 1
x.max <- 20
y.labels <- c("Selected Dark Orthogroups", "Orther Orthogroups")
names(y.labels) <- c("Yes", "No")
p3 <- data.seqs %>%
select(!!sym(col), selected) %>%
rename(value = !!sym(col)) %>%
rename(group = selected) %>%
filter(! is.na(value)) %>%
mutate(value = if_else(value < x.min, x.min,
if_else(value > x.max, x.max, value)
)
) %>%
ggplot(aes(x=value, y=group, fill=group)) +
geom_density_ridges(scale = 0.95) +
theme_ridges() +
theme(legend.position = "none",
plot.title = element_text(size=14*text.scale, hjust = 0.5, face="bold"),
axis.title.x = element_text(size=10*text.scale, hjust = 0.5, face="bold"),
axis.title.y = element_blank(),
axis.text.x = element_text(size=8*text.scale),
axis.text.y = element_text(size=8*text.scale, angle = 90, vjust = 0.5, hjust=0, face="bold")
) +
labs(title="Distribution of CDS/exon count per gene") +
scale_fill_cyclical(values = c("#ff7f00", "#984ea3")) +
labs(x=col.name) +
scale_x_continuous(expand = c(0, 0)) +
scale_y_discrete(expand = c(0, 0), label=y.labels)
p3
## Picking joint bandwidth of 0.195